#chargement des librairies
library(randomForest)
## Warning: le package 'randomForest' a été compilé avec la version R 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Warning: le package 'gbm' a été compilé avec la version R 4.2.2
## Loaded gbm 2.1.8.1
library(mgcv)
## Warning: le package 'mgcv' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : nlme
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
library(party)
## Warning: le package 'party' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : grid
## Le chargement a nécessité le package : mvtnorm
## Le chargement a nécessité le package : modeltools
## Le chargement a nécessité le package : stats4
## Le chargement a nécessité le package : strucchange
## Warning: le package 'strucchange' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : zoo
## Warning: le package 'zoo' a été compilé avec la version R 4.2.2
## 
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     as.Date, as.Date.numeric
## Le chargement a nécessité le package : sandwich
## Warning: le package 'sandwich' a été compilé avec la version R 4.2.2
library(electBook) 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
## Warning: le package 'forecast' a été compilé avec la version R 4.2.2
## 
## Attachement du package : 'forecast'
## L'objet suivant est masqué depuis 'package:nlme':
## 
##     getResponse
#chargment de données
df<-read.csv("df_ajout_var_calendar.csv",header=T,stringsAsFactors = T)
df<-df
#View(head(df))
str(df)
## 'data.frame':    455520 obs. of  12 variables:
##  $ DateHeure     : Factor w/ 35040 levels "2021-01-01 00:00:00+00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ region        : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
##  $ conso         : int  8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
##  $ temp          : num  276 277 278 274 275 ...
##  $ heure         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date          : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ period_journey: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ isHolyday     : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weekday       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Weekend       : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Mois          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ year          : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
df$DateHeure<-as.Date(df$DateHeure,format = "%Y-%m-%d %H:%M:%S")
#5 premières lignes
head(df)
##    DateHeure                  region conso     temp heure       date
## 1 2021-01-01    Auvergne-Rhône-Alpes  8576 276.1250     0 2021-01-01
## 2 2021-01-01                    PACA  6395 277.2833     0 2021-01-01
## 3 2021-01-01               Occitanie  5840 278.4167     0 2021-01-01
## 4 2021-01-01                Bretagne  3984 273.9833     0 2021-01-01
## 5 2021-01-01 Bourgogne-Franche-Comté  2671 275.1500     0 2021-01-01
## 6 2021-01-01      Nouvelle-Aquitaine  6512 275.1500     0 2021-01-01
##   period_journey isHolyday weekday Weekend Mois year
## 1              0      True       4   False    1 2021
## 2              0      True       4   False    1 2021
## 3              0      True       4   False    1 2021
## 4              0      True       4   False    1 2021
## 5              0      True       4   False    1 2021
## 6              0      True       4   False    1 2021
#info sur les colonnes
str(df)
## 'data.frame':    455520 obs. of  12 variables:
##  $ DateHeure     : Date, format: "2021-01-01" "2021-01-01" ...
##  $ region        : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
##  $ conso         : int  8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
##  $ temp          : num  276 277 278 274 275 ...
##  $ heure         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date          : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ period_journey: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ isHolyday     : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weekday       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Weekend       : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Mois          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ year          : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
str(df)
## 'data.frame':    455520 obs. of  12 variables:
##  $ DateHeure     : Date, format: "2021-01-01" "2021-01-01" ...
##  $ region        : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
##  $ conso         : int  8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
##  $ temp          : num  276 277 278 274 275 ...
##  $ heure         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date          : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ period_journey: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ isHolyday     : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weekday       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Weekend       : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Mois          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ year          : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
df$DateHeure<-as.Date(df$DateHeure,format = "%Y-%m-%d %H:%M:%S")
#df[['DateHeure']] <- as.POSIXct(df[['DateHeure']],format = "%Y-%m-%d %H:%M:%S")
df=subset(df ,region=="France")
df$Weekend<-as.logical(df$Weekend)
df$Weekend <- as.integer(df$Weekend)
df$isHolyday<-as.logical(df$isHolyday)
df$isHolyday <- as.integer(df$isHolyday)
col=c("DateHeure","conso","temp","heure","period_journey","isHolyday"         
,"weekday","Weekend","Mois")
#selection des variable
df_france=df[col]
#praparation des données
last <- tail(seq(nrow(df_france)), 48)
df_france$conso48 <- c(rep(NA, 48), df_france$conso[-last])
#separataion data
n_test <- 48
n_train<-nrow(df_france)-n_test
df_train <- head(df_france,n_train)
df_test <- tail(df_france,48)
#visualisation
plot(df_france$DateHeure, 
     df_france$conso, type = 'n')
lines(df_train$DateHeure, df_train$conso)
lines(df_test$DateHeure, df_test$conso, col = 2)

#Random Forest
formula_ML <- conso ~ temp + heure + period_journey + isHolyday + 
  weekday + Weekend + Mois
expert_rf <- randomForest(formula_ML, data = na.omit(df_train))
expert_rf_forecast <- predict(expert_rf, newdata=df_test)
mean(expert_rf_forecast)
## [1] 51819.14
print(sqrt(mean((expert_rf_forecast-df_test[,"conso"])**2)))
## [1] 7335.336
print(sqrt(mean((expert_rf_forecast-df_test[,"conso"])**2)))
## [1] 7335.336
#auto-arima sans covariables
ts.train <- ts(df_train, frequency = 7)
ts.test <- ts(ts.train, frequency = 7)
fit1=auto.arima(df_train[,"conso"])
forecast_auto_arima1=forecast(fit1,h=48)
forecast_auto_arima1 <- forecast_auto_arima1$mean
forecast_auto_arima1
## Time Series:
## Start = 34993 
## End = 35040 
## Frequency = 1 
##  [1] 48803.92 48900.33 49027.94 49194.22 49354.01 49517.65 49670.22 49812.47
##  [9] 49941.31 50056.74 50158.85 50248.36 50326.21 50393.47 50451.26 50500.67
## [17] 50542.73 50578.40 50608.56 50633.98 50655.35 50673.27 50688.26 50700.78
## [25] 50711.22 50719.90 50727.11 50733.10 50738.06 50742.17 50745.56 50748.36
## [33] 50750.67 50752.57 50754.14 50755.43 50756.49 50757.36 50758.08 50758.67
## [41] 50759.15 50759.54 50759.86 50760.13 50760.34 50760.52 50760.66 50760.78
print(sqrt(mean((forecast_auto_arima1-df_test[,"conso"])**2)))
## [1] 6313.171
#auto-arima avec covariable

#### objet ts ###################

date_start=as.Date("2021-01-01")
date_end=as.Date("2022-12-31")

#ts.data <- ts(data = data, start =date_start, end =date_end,frequency=48)
ts.train <- ts(df_train, frequency = 7)
ts.test <- ts(df_test, frequency = 7)
fit2=auto.arima(df_train[,"conso"],xreg=as.matrix(df_train[,3:9]))
forecast_auto_arima2<-forecast(fit2,h=48,xreg=as.matrix(df_test[,3:9]))
forecast_auto_arima2 <- forecast_auto_arima2$mean
print(sqrt(mean((forecast_auto_arima2-df_test[,"conso"])**2)))
## [1] 7240.465
library(prophet)
## Warning: le package 'prophet' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : Rcpp
## Le chargement a nécessité le package : rlang
# charger les donnée

# préparer les données pour Prophet
df_prophet=df_france
df_prophet$ds <-df_prophet$DateHeure
df_prophet$y <- df_prophet$conso
df_prophet$holiday <- ifelse(df_prophet$isHolyday  == "yes", 1, 0)

# inclure les variables exogènes
str(df_prophet) 
## 'data.frame':    35040 obs. of  13 variables:
##  $ DateHeure     : Date, format: "2021-01-01" "2021-01-01" ...
##  $ conso         : int  67010 67071 65052 64918 64376 63758 61883 60469 59103 58441 ...
##  $ temp          : num  275 275 275 275 274 ...
##  $ heure         : int  0 0 1 1 2 2 3 3 4 4 ...
##  $ period_journey: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ isHolyday     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Weekend       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Mois          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ conso48       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ds            : Date, format: "2021-01-01" "2021-01-01" ...
##  $ y             : int  67010 67071 65052 64918 64376 63758 61883 60469 59103 58441 ...
##  $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
df_prophet<-df_prophet[,setdiff(colnames(df_prophet),c("DateHeure","conso"))]
summary(df_prophet)
##       temp           heure       period_journey     isHolyday      
##  Min.   :270.3   Min.   : 0.00   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:280.8   1st Qu.: 5.75   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :285.6   Median :11.50   Median :0.5000   Median :0.00000  
##  Mean   :286.0   Mean   :11.50   Mean   :0.8333   Mean   :0.03014  
##  3rd Qu.:291.0   3rd Qu.:17.25   3rd Qu.:1.2500   3rd Qu.:0.00000  
##  Max.   :309.5   Max.   :23.00   Max.   :3.0000   Max.   :1.00000  
##                                                                    
##     weekday         Weekend            Mois           conso48     
##  Min.   :0.000   Min.   :0.0000   Min.   : 1.000   Min.   :29660  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 4.000   1st Qu.:43378  
##  Median :3.000   Median :0.0000   Median : 7.000   Median :50001  
##  Mean   :3.004   Mean   :0.2863   Mean   : 6.526   Mean   :52163  
##  3rd Qu.:5.000   3rd Qu.:1.0000   3rd Qu.:10.000   3rd Qu.:59978  
##  Max.   :6.000   Max.   :1.0000   Max.   :12.000   Max.   :88440  
##                                                    NA's   :48     
##        ds                   y            holiday 
##  Min.   :2021-01-01   Min.   :29660   Min.   :0  
##  1st Qu.:2021-07-02   1st Qu.:43378   1st Qu.:0  
##  Median :2021-12-31   Median :49988   Median :0  
##  Mean   :2021-12-31   Mean   :52153   Mean   :0  
##  3rd Qu.:2022-07-02   3rd Qu.:59969   3rd Qu.:0  
##  Max.   :2022-12-31   Max.   :88440   Max.   :0  
## 
# entraîner le modèle
m <- prophet(df_prophet,yearly.seasonality=TRUE,
weekly.seasonality = TRUE,holidays = df_prophet[df_prophet$holiday == 1,
                                                c("ds", "holiday")])
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# faire des prévisions
future <- make_future_dataframe(m, periods = 1)
forecast_prophet <- predict(m, future)
# afficher les prévisions
plot(m, forecast_prophet)

forecast_prophet<-tail(forecast_prophet$yhat,48)
######## méthodes tslm  ################

#regardons le le modèle tslm en incluant la saisonnalité 

formula_tslm <- conso ~ temp + heure + period_journey + isHolyday + 
  weekday + Weekend + Mois
fit_tslm=tslm(formula_tslm,data=ts.train) 

forecast_tslm<-predict(fit_tslm, newdata =ts.test[,3:9])
print(sqrt(mean((forecast_tslm-df_test[,"conso"])**2)))
## [1] 5613.051
########## methodes par aggrégations des poids des experts##########

# 2. Mixture  ####

library(opera)
## Warning: le package 'opera' a été compilé avec la version R 4.2.2
experts <- cbind(expert_rf_forecast,forecast_auto_arima1,forecast_auto_arima2,
forecast_tslm,forecast_prophet)
colnames(experts) <- c("rf","arima_sans_covariable","arima_avec_covariable",
                       "tslm","prophet")
or <- oracle(df_test$conso,experts, 
             model = "convex", 
             loss.type = "square")

rmse_exp <- apply(experts, 2,
                  function(x){sqrt(mean((x - df_test$conso)^2))})

rmse_exp %>% round(, digits = 0) %>% sort
##                  tslm arima_sans_covariable arima_avec_covariable 
##                  5613                  6313                  7240 
##                    rf               prophet 
##                  7335                 10879
# valeur théorique
M <- mean((df_train$conso - df_train$conso48)^2, na.rm = T)
learning.rate <- (1/M) * sqrt(8*log(ncol(experts))) / nrow(df_test)
agg.online_theoric<- mixture(Y = df_test$conso, 
                             experts = experts,
                             model = 'EWA', 
                             loss.type = "square",
                             loss.gradient = F,
                             parameter=list(eta=learning.rate))
plot(agg.online_theoric)
## Warning in par(def.par, new = FALSE): argument 1 does not name a graphical
## parameter
summary(agg.online_theoric)
## Aggregation rule: EWA 
## Loss function:  squareloss 
## Gradient trick:  FALSE 
## Coefficients: 
##     rf arima_sans_covariable arima_avec_covariable  tslm prophet
##  0.141                  0.28                 0.151 0.422 0.00598
## 
## Number of experts:  5
## Number of observations:  48
## Dimension of the data:  1 
## 
##         rmse  mape
## EWA     6450 0.138
## Uniform 7150 0.153
# optimisation sur les données
agg.online<- mixture(Y = df_test$conso , experts = experts,
                     model = 'EWA', loss.type = "square",
                     loss.gradient = F)
plot(agg.online)
## Warning in par(def.par, new = FALSE): argument 1 does not name a graphical
## parameter
summary(agg.online)
## Aggregation rule: EWA 
## Loss function:  squareloss 
## Gradient trick:  FALSE 
## Coefficients: 
##  rf arima_sans_covariable arima_avec_covariable tslm prophet
##   0                     0                     0    1       0
## 
## Number of experts:  5
## Number of observations:  48
## Dimension of the data:  1 
## 
##         rmse  mape
## EWA     5680 0.117
## Uniform 7150 0.153